Main question: at this point we’re interested in one single classification, i.e. what predicts whether people do maskless contacts with non-householders
sessionInfo()
R version 4.0.3 (2020-10-10)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur 10.16
Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] rpart_4.1-15 rattle_5.4.0 bitops_1.0-7 tibble_3.0.4 doParallel_1.0.16 iterators_1.0.13 foreach_1.5.1 cvms_1.3.0
[9] tidyr_1.1.2 randomForest_4.6-14 caret_6.0-86 lattice_0.20-41 DataExplorer_0.8.2 faux_1.0.0 dplyr_1.0.2 magrittr_1.5
[17] parsnip_0.1.6 ggplot2_3.3.2
loaded via a namespace (and not attached):
[1] Rcpp_1.0.5 lubridate_1.7.9.2 class_7.3-17 digest_0.6.27 ipred_0.9-9 R6_2.5.0 plyr_1.8.6 stats4_4.0.3
[9] evaluate_0.14 pillar_1.4.6 rlang_0.4.8 rstudioapi_0.13 data.table_1.14.0 Matrix_1.2-18 rmarkdown_2.5 splines_4.0.3
[17] gower_0.2.2 stringr_1.4.0 htmlwidgets_1.5.2 igraph_1.2.6 munsell_0.5.0 compiler_4.0.3 xfun_0.19 pkgconfig_2.0.3
[25] htmltools_0.5.0 nnet_7.3-14 tidyselect_1.1.0 gridExtra_2.3 prodlim_2019.11.13 codetools_0.2-16 crayon_1.3.4 withr_2.3.0
[33] MASS_7.3-53 recipes_0.1.15 ModelMetrics_1.2.2.2 grid_4.0.3 nlme_3.1-149 gtable_0.3.0 lifecycle_0.2.0 pROC_1.16.2
[41] scales_1.1.1 stringi_1.5.3 reshape2_1.4.4 timeDate_3043.102 ellipsis_0.3.1 generics_0.1.0 vctrs_0.3.4 lava_1.6.8.1
[49] tools_4.0.3 glue_1.4.2 purrr_0.3.4 networkD3_0.4 survival_3.2-7 colorspace_2.0-0 knitr_1.30
df <- read.csv("data/shield_gjames_21-06-10.csv")
grouping_var <- "behaviour_unmasked"
# feature_list <- colnames(df[, !(names(df) %in% c(grouping_var, "id"))])
feature_list <- c('intention_indoor_meeting', 'norms_people_present_indoors',
'sdt_motivation_extrinsic_2', 'sdt_motivation_identified_4', 'norms_family_friends', 'norms_risk_groups', 'norms_officials',
'norms_people_present_indoors')
if (grouping_var == "behaviour_unmasked") {
# df <- df %>% mutate(tmp = if_else(!!as.symbol(grouping_var) != 5, 'bad', 'good'))
df <- df %>% mutate(tmp = if_else(!!as.symbol(grouping_var) != 5, 0, 1))
names(df)[names(df) == 'tmp'] <- paste0(grouping_var, "_bool")
}
df[, paste0(grouping_var, "_bool")] <- as.factor(df[, paste0(grouping_var, "_bool")])
# df %<>%
# mutate_each_(funs(factor(.)), colnames(df))
# str(df)
ordinal_vars_mydata <- ordering_lookup %>%
dplyr::filter(varname %in% names(df)) %>%
dplyr::filter(ordering == "ordered")
df <- df %>%
# Ordered variables as ordinal factors
dplyr::mutate(across(.cols = ordinal_vars_mydata$varname,
~factor(., ordered = TRUE))) %>%
# Everything else as unordered factors
dplyr::mutate(across(.cols = -ordinal_vars_mydata$varname,
~factor(.))) %>%
# Fix ordering in the intention variables
dplyr::mutate(across(.cols = contains("intention_"),
~dplyr::recode_factor(.,
"1" = "4",
"2" = "1",
"3" = "2",
"4" = "3",
.ordered = TRUE)))
str(df)
'data.frame': 2272 obs. of 94 variables:
$ id : Factor w/ 2272 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
$ demographic_gender : Factor w/ 2 levels "1","2": 1 2 1 1 1 2 2 2 1 1 ...
$ demographic_age : Ord.factor w/ 5 levels "18-29"<"30-39"<..: 4 2 1 5 5 4 1 2 5 5 ...
$ demographic_4_areas : Factor w/ 4 levels "1","2","3","4": 1 2 1 1 2 1 1 4 4 1 ...
$ demographic_8_areas : Factor w/ 8 levels "1","2","3","4",..: 2 6 2 2 7 1 2 6 6 7 ...
$ behaviour_indoors_nonhouseholders: Ord.factor w/ 6 levels "1"<"2"<"3"<"4"<..: 5 5 3 4 5 3 5 5 4 5 ...
$ behaviour_close_contact : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 4 4 2 3 4 3 4 4 4 3 ...
$ behaviour_quarantined : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 2 ...
$ behaviour_unmasked : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 5 5 2 2 4 3 5 3 4 5 ...
$ mask_wearing_cloth_mask : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 2 1 1 1 ...
$ mask_wearing_disposable_mask : Factor w/ 2 levels "0","1": 2 2 2 1 2 1 1 2 1 1 ...
$ mask_wearing_certified_mask : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 2 1 ...
$ mask_wearing_ffp2 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
$ mask_wearing_vizire : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ mask_wearing_none : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ mask_wearing_other : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
$ mask_wearing_reuse : Factor w/ 5 levels "1","2","3","4",..: 2 4 2 5 3 2 5 4 2 4 ...
$ intention_store : Ord.factor w/ 4 levels "4"<"1"<"2"<"3": 1 1 1 1 1 1 1 3 1 1 ...
$ intention_public_transport : Ord.factor w/ 4 levels "4"<"1"<"2"<"3": 4 1 1 1 1 1 1 4 1 1 ...
$ intention_indoor_meeting : Ord.factor w/ 4 levels "4"<"1"<"2"<"3": 1 3 3 2 2 3 3 3 2 1 ...
$ intention_restaurant : Ord.factor w/ 4 levels "4"<"1"<"2"<"3": 2 2 2 2 1 1 2 3 1 2 ...
$ intention_pa : Ord.factor w/ 4 levels "4"<"1"<"2"<"3": 2 2 4 4 3 2 4 3 2 4 ...
$ automaticity_carry_mask : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 6 3 5 6 7 6 7 1 5 6 ...
$ automaticity_put_on_mask : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 4 6 6 6 7 7 7 1 6 6 ...
$ post_covid_maskwearing_if_reccd : Factor w/ 4 levels "1","2","3","4": 3 4 4 3 1 1 4 4 1 1 ...
$ inst_attitude_protects_self : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 2 6 4 4 4 6 7 4 4 6 ...
$ inst_attitude_protects_others : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 6 7 7 6 7 6 7 4 6 6 ...
$ inst_attitude_sense_of_community : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 4 4 4 6 7 4 7 1 5 6 ...
$ inst_attitude_enough_oxygen : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 2 3 7 6 7 4 7 1 5 3 ...
$ inst_attitude_no_needless_waste : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 3 1 7 6 7 4 7 1 5 1 ...
$ norms_family_friends : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 6 7 7 6 7 7 7 1 4 7 ...
$ norms_risk_groups : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 5 7 4 6 7 7 7 2 7 7 ...
$ norms_officials : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 7 7 7 6 7 7 7 7 7 7 ...
$ norms_people_present_indoors : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 5 7 7 6 7 4 7 4 6 7 ...
$ aff_attitude_comfortable : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 2 4 5 5 3 4 6 1 5 2 ...
$ aff_attitude_calm : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 4 3 7 6 7 5 6 3 6 3 ...
$ aff_attitude_safe : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 3 4 5 5 4 5 7 5 6 5 ...
$ aff_attitude_responsible : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 6 6 7 5 7 6 7 4 7 6 ...
$ aff_attitude_difficult_breathing : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 2 4 1 3 2 5 2 6 5 5 ...
$ barriers_nothing : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 2 1 1 1 ...
$ barriers_money : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ barriers_forget_carry : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 1 ...
$ barriers_forget_wear : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 2 1 ...
$ barriers_group_pressure : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ barriers_medical_symptoms : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ barriers_skin : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
$ barriers_difficult_breathing : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 2 ...
$ barriers_eyeglasses_fog : Factor w/ 2 levels "0","1": 1 2 2 2 1 2 1 2 1 2 ...
$ barriers_raspyvoice : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 2 1 1 ...
$ barriers_headache : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ barriers_drymouth : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
$ barriers_earpain : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 1 1 1 ...
$ barriers_general_uncomfy : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 2 1 2 ...
$ barriers_other : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ effective_means_handwashing : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 6 1 7 5 7 6 7 7 7 7 ...
$ effective_means_masks : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 5 5 1 5 7 6 6 1 7 7 ...
$ effective_means_distance : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 7 4 1 7 7 5 5 7 7 7 ...
$ effective_means_ventilation : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 7 4 4 7 7 5 5 4 6 7 ...
$ risk_likely_contagion : Ord.factor w/ 8 levels "1"<"2"<"3"<"4"<..: 2 4 4 3 2 2 2 3 3 1 ...
$ risk_contagion_absent_protection : Ord.factor w/ 8 levels "1"<"2"<"3"<"4"<..: 6 5 6 5 6 5 6 3 6 4 ...
$ risk_severity : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 6 2 5 6 4 5 3 1 4 7 ...
$ risk_fear_spread : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 4 5 7 4 6 5 7 4 3 7 ...
$ risk_fear_contagion_self : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 7 3 5 6 4 5 3 3 4 7 ...
$ risk_fear_contagion_others : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 7 7 6 6 7 6 7 7 4 7 ...
$ risk_fear_restrictions : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 5 3 1 3 1 4 1 7 3 4 ...
$ sdt_needs_autonomy_1 : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 2 3 5 3 5 2 5 2 4 2 ...
$ sdt_needs_competence_1 : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 3 4 5 4 5 4 5 4 4 3 ...
$ sdt_needs_relatedness_1 : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 3 5 1 4 5 4 5 1 5 4 ...
$ sdt_needs_autonomy_2 : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 2 4 2 3 5 4 5 1 4 4 ...
$ sdt_needs_competence_2 : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 3 5 5 2 4 4 5 3 4 3 ...
$ sdt_needs_relatedness_2 : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 3 2 4 3 5 4 5 2 5 4 ...
$ sdt_motivation_extrinsic1 : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 6 1 1 2 1 2 2 1 4 1 ...
$ sdt_motivation_amotivation_1 : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 1 1 1 2 1 1 1 5 2 1 ...
$ sdt_motivation_identified_1 : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 7 7 5 6 7 6 7 4 7 7 ...
$ sdt_motivation_introjected_1 : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 5 5 1 3 6 3 5 1 6 6 ...
$ sdt_motivation_extrinsic_2 : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 2 4 1 2 2 4 1 5 2 1 ...
$ sdt_motivation_introjected_2 : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 2 5 1 5 6 5 7 1 6 4 ...
$ sdt_motivation_amotivation_2 : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 1 1 1 1 1 2 1 5 1 1 ...
$ sdt_motivation_extrinsic_3 : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 1 2 1 4 1 5 1 6 2 1 ...
$ sdt_motivation_identified_2 : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 3 7 3 6 7 5 7 1 6 6 ...
$ sdt_motivation_identified_3 : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 6 7 2 6 7 5 7 1 7 6 ...
$ sdt_motivation_identified_4 : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 4 7 4 5 7 5 7 1 6 6 ...
$ sdt_motivation_amotivation_3 : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 2 1 1 2 1 1 1 6 1 1 ...
$ sdt_motivation_introjected_3 : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 2 7 3 5 6 5 5 4 6 6 ...
$ attention_check : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 2 1 1 2 1 1 1 1 1 1 ...
$ vaccination_status_intention_self: Ord.factor w/ 5 levels "4"<"1"<"2"<"3"<..: 1 2 3 1 1 1 3 4 1 2 ...
$ vaccination_status_closeones : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 1 2 1 2 4 2 3 2 4 4 ...
$ covid_tested : Factor w/ 4 levels "1","2","3","4": 1 3 2 2 3 1 2 2 2 2 ...
$ had_covid : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 1 2 5 2 1 1 4 1 2 1 ...
$ demographic_risk_group : Factor w/ 3 levels "1","2","3": 2 2 2 1 2 2 2 2 2 3 ...
$ needprotection_before_shots : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 1 1 1 2 1 1 1 4 1 1 ...
$ needprotection_after_1_shot : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 2 2 1 2 1 2 1 4 1 1 ...
$ needprotection_after_2_shots : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 3 5 7 2 3 3 3 4 1 1 ...
$ behaviour_unmasked_bool : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 2 1 1 2 ...
# Exploratory data analysis
plot_intro(df)
plot_bar(df)
1 columns ignored with more than 50 categories.
id: 2272 categories
plot_correlation(df)
1 features with more than 20 categories ignored!
id: 2272 categories
head(df[, c(paste0(grouping_var, "_bool"), grouping_var)])
x <- df %>%
select(-behaviour_unmasked_bool, -behaviour_unmasked, -id) %>%
as.data.frame()
y <- df$behaviour_unmasked_bool
set.seed(2021)
inTrain <- createDataPartition(y, p = .80, list = FALSE)[,1]
x_train <- x[ inTrain, ]
x_test <- x[-inTrain, ]
y_train <- y[ inTrain]
y_test <- y[-inTrain]
colnames(x_train)
[1] "demographic_gender" "demographic_age" "demographic_4_areas" "demographic_8_areas"
[5] "behaviour_indoors_nonhouseholders" "behaviour_close_contact" "behaviour_quarantined" "mask_wearing_cloth_mask"
[9] "mask_wearing_disposable_mask" "mask_wearing_certified_mask" "mask_wearing_ffp2" "mask_wearing_vizire"
[13] "mask_wearing_none" "mask_wearing_other" "mask_wearing_reuse" "intention_store"
[17] "intention_public_transport" "intention_indoor_meeting" "intention_restaurant" "intention_pa"
[21] "automaticity_carry_mask" "automaticity_put_on_mask" "post_covid_maskwearing_if_reccd" "inst_attitude_protects_self"
[25] "inst_attitude_protects_others" "inst_attitude_sense_of_community" "inst_attitude_enough_oxygen" "inst_attitude_no_needless_waste"
[29] "norms_family_friends" "norms_risk_groups" "norms_officials" "norms_people_present_indoors"
[33] "aff_attitude_comfortable" "aff_attitude_calm" "aff_attitude_safe" "aff_attitude_responsible"
[37] "aff_attitude_difficult_breathing" "barriers_nothing" "barriers_money" "barriers_forget_carry"
[41] "barriers_forget_wear" "barriers_group_pressure" "barriers_medical_symptoms" "barriers_skin"
[45] "barriers_difficult_breathing" "barriers_eyeglasses_fog" "barriers_raspyvoice" "barriers_headache"
[49] "barriers_drymouth" "barriers_earpain" "barriers_general_uncomfy" "barriers_other"
[53] "effective_means_handwashing" "effective_means_masks" "effective_means_distance" "effective_means_ventilation"
[57] "risk_likely_contagion" "risk_contagion_absent_protection" "risk_severity" "risk_fear_spread"
[61] "risk_fear_contagion_self" "risk_fear_contagion_others" "risk_fear_restrictions" "sdt_needs_autonomy_1"
[65] "sdt_needs_competence_1" "sdt_needs_relatedness_1" "sdt_needs_autonomy_2" "sdt_needs_competence_2"
[69] "sdt_needs_relatedness_2" "sdt_motivation_extrinsic1" "sdt_motivation_amotivation_1" "sdt_motivation_identified_1"
[73] "sdt_motivation_introjected_1" "sdt_motivation_extrinsic_2" "sdt_motivation_introjected_2" "sdt_motivation_amotivation_2"
[77] "sdt_motivation_extrinsic_3" "sdt_motivation_identified_2" "sdt_motivation_identified_3" "sdt_motivation_identified_4"
[81] "sdt_motivation_amotivation_3" "sdt_motivation_introjected_3" "attention_check" "vaccination_status_intention_self"
[85] "vaccination_status_closeones" "covid_tested" "had_covid" "demographic_risk_group"
[89] "needprotection_before_shots" "needprotection_after_1_shot" "needprotection_after_2_shots"
# # Define the control using a random forest selection function
# control <- rfeControl(functions = rfFuncs, # random forest
# method = "repeatedcv", # or just cv
# repeats = 10, # number of repeats
# number = 10) # the number of folds
tictoc::tic()
cl <- makePSOCKcluster(10)
registerDoParallel(cl)
set.seed(2021)
# Specify 10 fold cross-validation
ctrl_cv <- trainControl(method = "repeatedcv",
search="grid",
number = 10,
repeats=10,
timingSamps = 5,
# seeds = c(1:101)
)
# Predict income using decision tree
dec_mod <- train(x=x_train,
y=y_train,
method = "rpartScore",
trControl = ctrl_cv,
tuneGrid = expand.grid(
cp = seq(0,1,0.1),
split = c("abs", "quad"),
prune = c("mc", "mr")
)
)
stopCluster(cl)
tictoc::toc()
1357.946 sec elapsed
registerDoSEQ()
varimp_data <- varImp(dec_mod)
varimp_data
rpartScore variable importance
only 20 most important variables shown (out of 91)
dec_mod$results
# Post prediction
postResample(predict(dec_mod, x_test), y_test)
Accuracy Kappa
0.7026432 0.3190151
prediction_tibble <- tibble("target"=y_test,
"prediction"=predict(dec_mod, x_test))
prediction_table <- table(prediction_tibble)
cfm <- as_tibble(prediction_table)
plot_confusion_matrix(cfm,
target_col = "target",
prediction_col = "prediction",
counts_col = "n")
'rsvg' is missing. Will not plot arrows and zero-shading.
fancyRpartPlot(dec_mod$finalModel)
Unrecognized rpart object: treating as a numeric response model
pred_df <- data.frame(target=as.numeric(y_test),
prediction=as.numeric(predict(dec_mod, x_test)),
row.names = rownames(x_test))
pred_df$correct_or_not <- pred_df$target + pred_df$prediction
zero_ids <- rownames(pred_df[pred_df[, "correct_or_not"] == 2,])
one_ids <- rownames(pred_df[pred_df[, "correct_or_not"] == 4,])
length(zero_ids)
[1] 249
length(one_ids)
[1] 70
df[zero_ids, ]
df[one_ids, ]
top_features <- rownames(head(varimp_data$importance, 3))
# top_features <- c("behaviour_indoors_nonhouseholders", "behaviour_close_contact", "intention_indoor_meeting")
# df$demographic_gender <- factor(df$demographic_gender)
# df <- data.frame(apply(df, 2, factor))
# df %<>%
# mutate_each_(funs(factor(.)),top_features)
# # str(df)
x <- df[top_features]
y <- factor(df$behaviour_unmasked_bool)
set.seed(2021)
inTrain <- createDataPartition(y, p = .80, list = FALSE)[,1]
x_train <- x[ inTrain, ]
x_test <- x[-inTrain, ]
y_train <- y[ inTrain]
y_test <- y[-inTrain]
colnames(x_train)
[1] "behaviour_indoors_nonhouseholders" "intention_store" "covid_tested"
cl <- makePSOCKcluster(10)
registerDoParallel(cl)
set.seed(2021)
# Specify 10 fold cross-validation
ctrl_cv <- trainControl(method = "repeatedcv",
search="grid",
number = 10,
repeats=10,
timingSamps = 5,
# seeds = c(1:101)
)
# Predict income using decision tree
dec_mod <- train(x=x_train,
y=y_train,
method = "rpartScore",
trControl = ctrl_cv,
tuneGrid = expand.grid(
cp = seq(0,1,0.1),
split = c("abs", "quad"),
prune = c("mc", "mr")
)
)
stopCluster(cl)
registerDoSEQ()
# Post prediction
postResample(predict(dec_mod, x_test), y_test)
Accuracy Kappa
0.7048458 0.3186542
prediction_tibble <- tibble("target"=y_test,
"prediction"=predict(dec_mod, x_test))
prediction_table <- table(prediction_tibble)
cfm <- as_tibble(prediction_table)
plot_confusion_matrix(cfm,
target_col = "target",
prediction_col = "prediction",
counts_col = "n")
'rsvg' is missing. Will not plot arrows and zero-shading.
NA
NA
fancyRpartPlot(dec_mod$finalModel)
Unrecognized rpart object: treating as a numeric response model
varImp(dec_mod)
rpartScore variable importance
ggplot(data=df, aes(x=id, y=intention_store, color=demographic_gender)) + geom_point()
ggplot(data=df, aes(x=id, y=behaviour_indoors_nonhouseholders, color=demographic_gender)) + geom_point()
dec_mod
CART or Ordinal Responses
1818 samples
3 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times)
Summary of sample sizes: 1637, 1637, 1637, 1636, 1635, 1637, ...
Resampling results across tuning parameters:
cp split prune Accuracy Kappa
0.0 abs mc 0.7085242 0.3305165
0.0 abs mr 0.7085242 0.3305165
0.0 quad mc 0.7111667 0.3354733
0.0 quad mr 0.7111667 0.3354733
0.1 abs mc 0.7095697 0.3349478
0.1 abs mr 0.7095697 0.3349478
0.1 quad mc 0.7095697 0.3349478
0.1 quad mr 0.7095697 0.3349478
0.2 abs mc 0.7095697 0.3349478
0.2 abs mr 0.7095697 0.3349478
0.2 quad mc 0.7095697 0.3349478
0.2 quad mr 0.7095697 0.3349478
0.3 abs mc 0.6122135 0.0000000
0.3 abs mr 0.6122135 0.0000000
0.3 quad mc 0.6122135 0.0000000
0.3 quad mr 0.6122135 0.0000000
0.4 abs mc 0.6122135 0.0000000
0.4 abs mr 0.6122135 0.0000000
0.4 quad mc 0.6122135 0.0000000
0.4 quad mr 0.6122135 0.0000000
0.5 abs mc 0.6122135 0.0000000
0.5 abs mr 0.6122135 0.0000000
0.5 quad mc 0.6122135 0.0000000
0.5 quad mr 0.6122135 0.0000000
0.6 abs mc 0.6122135 0.0000000
0.6 abs mr 0.6122135 0.0000000
0.6 quad mc 0.6122135 0.0000000
0.6 quad mr 0.6122135 0.0000000
0.7 abs mc 0.6122135 0.0000000
0.7 abs mr 0.6122135 0.0000000
0.7 quad mc 0.6122135 0.0000000
0.7 quad mr 0.6122135 0.0000000
0.8 abs mc 0.6122135 0.0000000
0.8 abs mr 0.6122135 0.0000000
0.8 quad mc 0.6122135 0.0000000
0.8 quad mr 0.6122135 0.0000000
0.9 abs mc 0.6122135 0.0000000
0.9 abs mr 0.6122135 0.0000000
0.9 quad mc 0.6122135 0.0000000
0.9 quad mr 0.6122135 0.0000000
1.0 abs mc 0.6122135 0.0000000
1.0 abs mr 0.6122135 0.0000000
1.0 quad mc 0.6122135 0.0000000
1.0 quad mr 0.6122135 0.0000000
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were cp = 0, split = quad and prune = mc.
dec_mod$bestTune
dec_mod$finalModel
n= 1818
node), split, n, deviance, yval
* denotes terminal node
1) root 1818 705 1
2) behaviour_indoors_nonhouseholders=1,2,3,4,5 1421 418 1 *
3) behaviour_indoors_nonhouseholders=6 397 110 2
6) covid_tested=1 79 32 2 *
7) covid_tested=2,3,4 318 78 2
14) intention_store=1,2,3 27 13 2
28) intention_store=4,1,2 17 7 1 *
29) intention_store=3 10 3 2 *
15) intention_store=4 291 65 2 *